#!/bin/bash
# **************************************************************************
# Function:  Wrapper that helps launching Intel MPI jobs within Slurm
#            using MICs in native mode.
#            mpiexec.hydra needs passwordless ssh access to all involved nodes
# Version:   0.4
#---------------------------------------------------------------------------
# 11.10.2013 Created by Chrysovalantis Paschoulas, Juelich Supercomputing Centre - Forschungszentrum Juelich
# Initial script by (C) Olli-Pekka Lehto - CSC IT Center for Science Ltd.
# **************************************************************************

# Usage message
# Printed by -h and on every argument error. The text is kept in sync with
# the option parser (including the long options --tv / --tvcli) and with the
# environment variables this script actually reads.
USAGE="
USAGE
  $(basename "$0") [ [-h] | [-v] [--tv | --tvcli] [-x <host num tasks> -c <host binary>] [-z <mic num tasks> -m <mic binary>] ]

OPTIONS
  -h       Print this message.
  -c       Binary that will run on host nodes. If it is not set then only the MICs will be used.
  -m       Binary that will run inside the MICs.
  -x       Number of tasks (MPI ranks) for the host nodes. Default value is 1.
  -z       Number of tasks (MPI ranks) for the MICs. Default value is 1.
  -v       Show more info for this script.
  --tv     Run using TotalView (equivalent to export MPIEXEC_PREFIX=\"totalview -args\").
  --tvcli  Run using TotalView cli (equivalent to export MPIEXEC_PREFIX=\"totalviewcli -args\")

MORE INFO
  The job layout is controlled by the following environment variables (export them before calling this script):
    MIC_NUM_PER_HOST     Number of MICs on each host that will be used by mpiexec. Available options: 0, 1, 2. Default 2.
    OMP_NUM_THREADS      OpenMP threads number per task on hosts. This MUST be exported when OpenMP is used!
    MIC_OMP_NUM_THREADS  OpenMP threads number per task on MICs. If not defined then is set same as OMP_NUM_THREADS.
    MIC_LD_LIBRARY_PATH  Prepended to LD_LIBRARY_PATH in the environment of the tasks running on the MICs.

  Also the user MAY pass additional flags to mpiexec exporting the following env vars:
    MPIEXEC_PREFIX       Wrap the execution of mpiexec with another tool (e.g. totalview).
    MPIEXEC_FLAGS_HOST   Flags that will be passed to the hosts.
    MPIEXEC_FLAGS_MIC    Flags that will be passed to the MICs.
  -- Examples:
       export MPIEXEC_PREFIX=\"totalview -args\"
       export MPIEXEC_PREFIX=\"totalviewcli -args\"
       export MPIEXEC_FLAGS_HOST=\"-env VAR VALUE\"
       export MPIEXEC_FLAGS_MIC=\"-envlist VAR1,VAR2\"

EXAMPLES
  Batch Script1 - Only hosts:
    ---
    #!/bin/bash
    #SBATCH -J TestJobMICNativeHybrid
    #SBATCH -N 4
    #SBATCH -p q_mics
    #SBATCH -o TestJob-%j.out
    #SBATCH -e TestJob-%j.err
    #SBATCH --time=30

    module purge
    module load impi intel/13.1.3

    export MIC_NUM_PER_HOST=0
    export OMP_NUM_THREADS=32

    mpirun-mic -x 1 -c ./impi_native_hybrid
    ---

  Batch Script2 - Only mics:
    ---
    #!/bin/bash
    #SBATCH -J TestJobMICNativeHybrid
    #SBATCH -N 4
    #SBATCH -p q_mics
    #SBATCH -o TestJob-%j.out
    #SBATCH -e TestJob-%j.err
    #SBATCH --time=30

    module purge
    module load impi intel/13.1.3

    export MIC_NUM_PER_HOST=2
    export MIC_OMP_NUM_THREADS=240

    mpirun-mic -z 1 -m ./impi_native_hybrid.mic
    ---

  Batch Script3 - Hosts and MICs:
    ---
    #!/bin/bash
    #SBATCH -J TestJobMICNativeHybrid
    #SBATCH -N 2
    #SBATCH -p q_mics
    #SBATCH -o TestJob-%j.out
    #SBATCH -e TestJob-%j.err
    #SBATCH --time=30

    module purge
    module load impi intel/13.1.3

    export MIC_NUM_PER_HOST=2
    export OMP_NUM_THREADS=2
    export MIC_OMP_NUM_THREADS=4

    mpirun-mic -v -x 16 -c ./impi_native_hybrid -z 60 -m ./impi_native_hybrid.mic
    ---
";

# The wrapper cannot do anything useful without options: show the usage text
# on stderr and fail when it is called with an empty command line.
if (( $# < 1 )); then
  echo "$USAGE" >&2
  exit 1
fi

# get script arguments
#######################################
# Parse the command line of the wrapper.
# Long options are handled through the getopts "-:" trick: "--name" is seen
# as option "-" with OPTARG "name".
# Globals:
#   USAGE (read)
#   HOST_BINARY, MIC_BINARY, HOST_PPN, MIC_PPN,
#   MPIRUN_MIC_VERBOSE, MPIEXEC_PREFIX (written)
# Arguments:
#   The script's positional parameters ("$@").
# Outputs:
#   Usage text on stdout for -h, on stderr for any invalid option.
# Returns:
#   Does not return on -h (exit 0) or on an invalid option (exit 1).
#######################################
parse_cli_options() {
  local OPTION OPTARG
  local OPTIND=1   # restart getopts from the first argument on every call

  while getopts "vhc:m:x:z:-:" OPTION; do
    case "$OPTION" in
      c)
        HOST_BINARY=$OPTARG
        ;;
      h)
        echo "$USAGE"
        exit 0
        ;;
      m)
        MIC_BINARY=$OPTARG
        ;;
      v)
        MPIRUN_MIC_VERBOSE=1
        ;;
      x)
        HOST_PPN=$OPTARG
        ;;
      z)
        MIC_PPN=$OPTARG
        ;;
      -)
        case "$OPTARG" in
          tv)
            MPIEXEC_PREFIX="totalview -args"
            ;;
          tvcli)
            MPIEXEC_PREFIX="totalviewcli -args"
            ;;
          *)
            # Any other long option is an error. A "\?" pattern here would
            # only match a literal "--?" and let typos pass unnoticed.
            echo "$(basename "$0"): unknown option --$OPTARG" >&2
            echo "$USAGE" >&2
            exit 1
            ;;
        esac
        ;;
      *)
        # getopts already reported the unknown option / missing argument.
        echo "$USAGE" >&2
        exit 1
        ;;
    esac
  done
}

parse_cli_options "$@"


### prepare the environment
# Outside of a Slurm allocation the job is run on the local machine only
# (which still has to be a compute node): act as task 0 on this host.
: "${SLURM_PROCID:=0}"
: "${SLURM_NODELIST:=$(hostname)}"

# Defaults for everything the user did not specify: one MPI rank per host,
# one MPI rank per MIC and two MICs per host.
: "${MIC_PPN:=1}"
: "${HOST_PPN:=1}"
: "${MIC_NUM_PER_HOST:=2}"


# OMP_NUM_THREADS tells us whether this is a hybrid MPI+OpenMP job.
# When it is set and no MIC specific thread count was given, the MIC tasks
# inherit the host value.
if [[ -n "$OMP_NUM_THREADS" && -z "$MIC_OMP_NUM_THREADS" ]]; then
  MIC_OMP_NUM_THREADS=$OMP_NUM_THREADS
fi

# At least one binary (host or MIC) is required; without any there is
# nothing to launch, so show the usage text on stderr and fail.
if [[ -z "${HOST_BINARY}${MIC_BINARY}" ]]; then
  echo "$USAGE" >&2
  exit 1
fi

# create the command line
# MPI_EXEC is the launcher that gets started (mpirun is kept below as a
# disabled alternative); EXEC_ARGS starts empty and collects the per-node
# argument sets.
#MPI_EXEC=mpirun
MPI_EXEC="mpiexec.hydra"
EXEC_ARGS=

# create the lists of HOSTS AND MICS!
#######################################
# Build the node lists used to place the MPI ranks.
# Globals:
#   SLURM_NODELIST (read)    hostlist expression of the job
#   MIC_NUM_PER_HOST (read)  0, 1 or 2; anything else is treated as 0
#   SLIST_HOSTS_WITH_MICS    (written) sinfo node expressions whose GRES
#                            column mentions "mic"
#   LLIST_HOSTS_WITH_MICS    (written) the same nodes, one name per word
#   HOST_NODELIST            (written) all nodes of the job
#   MIC_NODELIST             (written) <host>-mic0 [<host>-mic1] for every
#                            job node that is configured with MICs
# Outputs:
#   A warning on stderr when MIC_NUM_PER_HOST is not 0, 1 or 2.
#######################################
build_node_lists() {
  local entry host mic_count

  # Accept only plain integers up to 2, so that a typo gives one clear
  # warning instead of a "[: integer expression expected" per node.
  if [[ "$MIC_NUM_PER_HOST" =~ ^[0-9]+$ ]] && (( 10#$MIC_NUM_PER_HOST <= 2 )); then
    mic_count=$(( 10#$MIC_NUM_PER_HOST ))
  else
    echo "$(basename "$0"): MIC_NUM_PER_HOST='$MIC_NUM_PER_HOST' is not 0, 1 or 2; no MICs will be used" >&2
    mic_count=0
  fi

  # Nodes that are configured to have MICs. Only the GRES column (%G) is
  # inspected, so a node whose *name* contains "mic" is not picked up.
  LLIST_HOSTS_WITH_MICS=""
  SLIST_HOSTS_WITH_MICS=$(sinfo -h -o "%N %G" | awk '$2 ~ /mic/ { print $1 }')

  # Expand every sinfo line separately and skip the expansion when there is
  # none: called without a hostlist, scontrol would expand the node list of
  # the job instead, marking every job node as a MIC host. The expression is
  # quoted because "node[1-4]" is also a valid glob pattern.
  while IFS= read -r entry; do
    [[ -n "$entry" ]] || continue
    # The expanded names are plain host names, word splitting is intended.
    for host in $(scontrol show hostname "$entry" < /dev/null); do
      LLIST_HOSTS_WITH_MICS="${LLIST_HOSTS_WITH_MICS} ${host}"
    done
  done <<< "$SLIST_HOSTS_WITH_MICS"

  HOST_NODELIST=""
  MIC_NODELIST=""
  for host in $(scontrol show hostname "$SLURM_NODELIST"); do
    # Whole-word comparison: "node1" must not match "node10".
    if (( mic_count > 0 )) && [[ " ${LLIST_HOSTS_WITH_MICS} " == *" ${host} "* ]]; then
      MIC_NODELIST="${MIC_NODELIST} ${host}-mic0"
      if (( mic_count == 2 )); then
        MIC_NODELIST="${MIC_NODELIST} ${host}-mic1"
      fi
    fi
    HOST_NODELIST="${HOST_NODELIST} ${host}"
  done
}

build_node_lists


# create the arguments
#######################################
# Append one mpiexec argument set (" : <flags> -n <ranks> -host <node>
# <binary>") to EXEC_ARGS for every host and every MIC that takes part.
# Globals:
#   EXEC_ARGS (appended to)
#   HOST_BINARY, HOST_NODELIST, HOST_PPN, OMP_NUM_THREADS,
#   MPIEXEC_FLAGS_HOST (read)
#   MIC_BINARY, MIC_NODELIST, MIC_PPN, MIC_OMP_NUM_THREADS,
#   MIC_LD_LIBRARY_PATH, LD_LIBRARY_PATH, MPIEXEC_FLAGS_MIC (read)
#######################################
build_exec_args() {
  local node

  # args for hosts here
  # run job on hosts if host binary is not null
  if [[ -n "$HOST_BINARY" && -n "$HOST_NODELIST" ]]; then
    # HOST_NODELIST is a blank separated list, word splitting is intended.
    for node in $HOST_NODELIST; do
      if [[ -n "$OMP_NUM_THREADS" ]]; then
        # with OpenMP
        EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $OMP_NUM_THREADS $MPIEXEC_FLAGS_HOST -n $HOST_PPN -host $node $HOST_BINARY"
      else
        # without OpenMP
        EXEC_ARGS="${EXEC_ARGS} : $MPIEXEC_FLAGS_HOST -n $HOST_PPN -host $node $HOST_BINARY"
      fi
    done
  fi

  # args for mics here
  # run job on mics if mic binary is not null and MIC_NUM_PER_HOST is 1 or 2.
  # The binary must be checked explicitly: MIC_NODELIST is filled whenever
  # the job nodes have MICs, so a host-only run would otherwise get MIC
  # argument sets without an executable.
  if [[ -n "$MIC_BINARY" && -n "$MIC_NODELIST" ]]; then
    for node in $MIC_NODELIST; do
      if [[ -n "$MIC_OMP_NUM_THREADS" ]]; then
        # with OpenMP
        EXEC_ARGS="${EXEC_ARGS} : -env OMP_NUM_THREADS $MIC_OMP_NUM_THREADS -env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $node $MIC_BINARY"
      else
        # NO OpenMP
        EXEC_ARGS="${EXEC_ARGS} : -env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $node $MIC_BINARY"
      fi
    done
  fi
}

build_exec_args

# Full command line: the launcher followed by its argument sets, wrapped by
# MPIEXEC_PREFIX (e.g. a debugger) when one is set.
RUNCMD="${MPIEXEC_PREFIX:+$MPIEXEC_PREFIX }$MPI_EXEC $EXEC_ARGS"

# Site specific Intel MPI settings (they depend on the local system):
# I_MPI_MIC and the DAPL provider list are exported for the launcher, while
# I_MPI_DEVICE and I_MPI_PMI_LIBRARY are removed from the environment.
#export LD_LIBRARY_PATH="$MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
export I_MPI_MIC=1 I_MPI_DAPL_PROVIDER_LIST=ofa-v2-mlx4_0-1
unset I_MPI_DEVICE I_MPI_PMI_LIBRARY

# start the job
# Only the task whose SLURM_PROCID is 0 launches mpiexec. With -v a summary
# of the configuration and the exact command line is printed first.
if [ $SLURM_PROCID -eq 0 ] ; then

  if [[ -n "$MPIRUN_MIC_VERBOSE" ]]; then
    printf '%s\n' \
      "" \
      "########################################################################" \
      "MPI Tasks per host:          $HOST_PPN" \
      "Threads per host MPI task:   $OMP_NUM_THREADS" \
      "Binary for the hosts:        $HOST_BINARY" \
      "MPI Tasks per MIC:           $MIC_PPN" \
      "Threads per MIC MPI task:    $MIC_OMP_NUM_THREADS" \
      "Binary for the mics:         $MIC_BINARY" \
      "MIC_NUM_PER_HOST:            $MIC_NUM_PER_HOST" \
      "" \
      "MPIEXEC_PREFIX:              $MPIEXEC_PREFIX" \
      "MPIEXEC_FLAGS_HOST:          $MPIEXEC_FLAGS_HOST" \
      "MPIEXEC_FLAGS_MIC:           $MPIEXEC_FLAGS_MIC" \
      "" \
      "Run command: " \
      "$RUNCMD" \
      "########################################################################" \
      ""
  fi

  # RUNCMD is deliberately left unquoted: it is a flat string that has to be
  # split into the launcher and its arguments.
  $RUNCMD

fi



